Harvesting Web 2.0: An Introduction to API Processes in R

MAJ Ross Schuchard

ARCYBER

20 May 2016

Outline

Web 2.0 Concept Overview

Humans as Sensors within Web 2.0 Construct

SOURCE: 18 May 2016

Application Program Interfaces (APIs)

Introduction to twitteR

Process for Creating Access Tokens

Create Application on Twitter Dev

View Created Application on Twitter Dev

Find Consumer Key Information + Generate Access Tokens

Find Access Token Information from Created Application

Connect to Twitter API from R

#Install package if you do not have in your package repository
#install.packages("twitteR") #uncomment if you require installation

#Call twitteR package
library(twitteR)

# Create access credential variables. Replace each placeholder with your own
# key/token. They must be QUOTED character strings -- the original bare names
# (e.g. YOURapi_key) would fail with "object 'YOURapi_key' not found".
api_key <- "YOUR_API_KEY"
api_secret <- "YOUR_API_SECRET"
token <- "YOUR_ACCESS_TOKEN"
token_secret <- "YOUR_ACCESS_TOKEN_SECRET"

# Create Twitter Connection
setup_twitter_oauth(api_key, api_secret, token, token_secret) #Answer 'YES' to create connection
## [1] "Using direct authentication"

Time to Harvest Some Tweets

# Pull the 100 most recent tweets matching the keyword "egyptair"
egypt <- searchTwitter("egyptair", n = 100)

# Flatten the returned list of status objects into a data frame
egypt.df <- twListToDF(egypt)

# Inspect the metadata columns that accompany each harvested tweet
names(egypt.df)
##  [1] "text"          "favorited"     "favoriteCount" "replyToSN"    
##  [5] "created"       "truncated"     "replyToSID"    "id"           
##  [9] "replyToUID"    "statusSource"  "screenName"    "retweetCount" 
## [13] "isRetweet"     "retweeted"     "longitude"     "latitude"

Detailed Search Options

# Pull up to 1000 tweets that mention both search terms
egyptISIS <- searchTwitter("egyptair + ISIS", n = 1000)

# Flatten the returned list of status objects into a data frame
egyptISIS.df <- twListToDF(egyptISIS)

# Persist the harvest to disk as a .csv so it can be reused in later sessions
write.csv(egyptISIS.df, "EgyptAirTweets.csv", row.names = FALSE)

What Does the Conversation Look Like?

# Packages for web access, text mining, graphing, and visualization
library(RCurl)
library(RJSONIO)
library(stringr)
library(tm)
library(igraph)
library(RColorBrewer)
library(httr)
library(wordcloud)

# Extract the raw text of every harvested status object into a character vector
text1 <- sapply(egyptISIS, function(status) status$getText())

#Prepare text through extensive regex operations to normalize to substantive words

# Normalize raw tweet text down to substantive words. Returns a character
# vector the same length as `txt`. NOTE: the original deleted whitespace
# runs outright (replacement ""), which fused adjacent words together and
# corrupted the word frequencies; runs are now collapsed to a single space.
clean_tweet_text <- function(txt) {
  # Remove retweet markers ("RT @user", "via @user") to avoid duplicative text
  txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", txt)
  # Remove mentions (i.e. @somebody)
  txt <- gsub("@\\w+", "", txt)
  # Remove punctuation
  txt <- gsub("[[:punct:]]", "", txt)
  # Remove numbers
  txt <- gsub("[[:digit:]]", "", txt)
  # Remove html links (punctuation was already stripped, so a URL is one \w+ run)
  txt <- gsub("http\\w+", "", txt)
  # Collapse runs of spaces/tabs to a single space
  txt <- gsub("[ \t]{2,}", " ", txt)
  # Trim leading/trailing whitespace
  gsub("^\\s+|\\s+$", "", txt)
}

# Apply the cleanup in place (guard lets this chunk be sourced standalone)
if (exists("text1")) text1 <- clean_tweet_text(text1)


# Lower-case conversion that degrades gracefully: returns NA instead of
# raising when tolower() errors on malformed text (e.g. invalid multibyte
# characters that occasionally appear in harvested tweets).
tryTolower <- function(x) {
  tryCatch(tolower(x), error = function(e) NA)
}

# Fold everything to lower case, tolerating malformed strings via tryTolower
text1 <- sapply(text1, tryTolower)

# Build a tm corpus from the cleaned character vector
text1_corpus <- Corpus(VectorSource(text1))

# Term-document matrix: rows are terms, columns are individual tweets.
# English stopwords, punctuation, and numbers are dropped at build time.
tdm <- TermDocumentMatrix(
  text1_corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c(stopwords("en")),
    removeNumbers = TRUE,
    tolower = TRUE
  )
)


m <- as.matrix(tdm)

#Establish word counts in decreasing order
word_freqs <- sort(rowSums(m), decreasing = TRUE)
#Create a data frame with words and their frequencies
dm <- data.frame(word = names(word_freqs), freq = word_freqs)
#Declare plot name
plotfile1 <- "EgyptISIS_wordcloud.png"

# Draw the cloud on the active (screen) device for the slide output.
# "Dark2" provides at most 8 colors, so request 8 -- the original asked
# brewer.pal for 12, which warns and returns 8 anyway.
wordcloud(dm$word, dm$freq, min.freq = 2, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))

# Save a copy to disk: open the PNG device BEFORE plotting, redraw, then
# close the device. (The original called png() AFTER wordcloud() and never
# called dev.off(), so the image file was never actually written.)
png(filename = plotfile1, width = 740, height = 740, units = "px")
wordcloud(dm$word, dm$freq, min.freq = 2, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
dev.off()

#NOTE: Natural language and word cloud code adapted 
#from http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/

Geolocation of Tweets

# Constrain the search to a specific geographic area so we can assume
# reasonable confidence in the location of the returned tweets

# Ask for the last 10000 tweets within a 50-mile radius of central Paris
egyptParis <- searchTwitter("egyptair", n = 10000, geocode = "48.8647,2.3490,50mi")
## [1] "Rate limited .... blocking for a minute and retrying up to 119 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 118 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 117 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 116 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 115 times ..."
# Flatten the returned list of status objects into a data frame
egyptParis.df <- twListToDF(egyptParis)

# Persist the harvest to disk as a .csv so it can be reused in later sessions
write.csv(egyptParis.df, "EgyptParisTweets.csv", row.names = FALSE)

Map Verification

#Use mapping package leaflet to view geolocation results for entire dataframe
library(leaflet)

# Reload the harvested tweets from disk
paris <- read.csv("EgyptParisTweets.csv")

names(paris)
##  [1] "text"          "favorited"     "favoriteCount" "replyToSN"    
##  [5] "created"       "truncated"     "replyToSID"    "id"           
##  [9] "replyToUID"    "statusSource"  "screenName"    "retweetCount" 
## [13] "isRetweet"     "retweeted"     "longitude"     "latitude"
# Subset to tweets that actually carry coordinates. Reference the columns by
# NAME rather than by position -- the original used paris[,15], a magic index
# that silently breaks if the column order changes, and it only checked one
# of the two coordinates.
egyptCoords <- paris[!is.na(paris$longitude) & !is.na(paris$latitude), ]

#Declare map focused on Europe continent, with Paris as the centroid
m <- leaflet() %>% setView(lng = 2.3490, lat = 48.8647, zoom = 10)

#Add specific map type and reference data frame for coordinates to plot
m %>% addProviderTiles("CartoDB.Positron") %>%
  addCircles(data = egyptCoords, lat = ~ latitude, lng = ~ longitude)

Twitter Applications

Web Query Interest as a Potential Determinant

Operations Research vs. Data Science Comparison

#Access to GoogleTrends through `gtrendsR` package

#Install package if you do not already have it
#install.packages("gtrendsR")

#Declare package
library(gtrendsR)

# Google authentication credentials. Replace the placeholder strings with
# your own -- they must be QUOTED character strings; the original bare names
# (Youruser, Yourpassword) would fail with "object 'Youruser' not found".
user <- "joe.smith@gmail.com"      # your Google id
password <- "YOUR_GOOGLE_PASSWORD" # your Google password

# Establish connection to Google
# NOTE(review): newer gtrendsR releases dropped gconnect() -- verify against
# the installed package version.
gconnect(user, password)

#Explore a trend
hotJobs <- gtrends(c("data science", "operations research"))

#Plot results
plot(hotJobs)

Other Web 2.0 API Options in R

Build a Comprehensive Picture through Data Fusion

Dashboard for an Area of Operations

Questions